/***************************************************************************************
Firm-embedded productivity and cross-country income differences
Alviarez, Cravino and Ramondo
Journal of Political Economy (2022)

Program: prep_02_forreg_cttesample.do
Date: October 2022

Description: Constructiong bilateral shares at the level of the firm, sin(phi), for firms that appear in ORBIS in every year between 2010 and 2016, as these are arguably the years when the ORBIS data are of the highest quality.

*****************************************************************************************/

clear all
set more off
local var0 sales /*Key variable of interest*/
local varset sales emp exports va labcost /*all relevant variables*/
local types "naics"


*First save the set of firms with information since the year 2010
*-------------------------------------------------------
use "${data}/firm_allyears_sales.dta", clear
keep year isocode id_bvd sales
drop if sales==. | sales==0
reshape wide sales, i( isocode id) j(year)
gen dummy_ctte=(sales2010!=. &  sales2011!=. &  sales2012!=. &  sales2013!=. &  sales2014!=. &  sales2015!=. &  sales2016!=.)
display "$cond_ctte"
keep if $cond_ctte
tempfile sample_constant
save `sample_constant', replace 


etime, start
foreach type in `types' {

display "`type'"

use year isocode id_bvd guo_bvd hq `varset' sector* NAICS* con_code using "${data}/firm_allyears_sales.dta", clear 
merge m:1 isocode id_bvd using `sample_constant', keepusing(id_bvd) 
keep if _merge==3
drop _merge

drop if `var0'==. | `var0'==0
tab con_code
drop if con_code=="$lf"
drop con_code
tab sector


*Choosing the industry level 
*-----------------------------------------------------
if "`type'"=="naics4" {
drop if sector=="NA"
tab sector
local industry NAICS4 
}
if "`type'"=="naics3" {
drop if sector=="NA"
tab sector
local industry NAICS3 
}
if "`type'"=="naics" {
drop if sector=="NA"
tab sector
local industry sector 
}
if "`type'"=="market" {
keep if sector1=="Manufacturing (C)" | sector1=="Market_Services (G-H-I-J-K-M-N-R-S-T)" | sector1=="Other_Goods (A-B-D-E-F)" 
drop sector*
gen sector="Market (MARKT)"
gen sector1=sector
local industry sector 
}
if "`type'"=="MS" {
keep if sector1=="Manufacturing (C)" |  sector1=="Market_Services (G-H-I-J-K-M-N-R-S-T)" 
drop sector*
gen sector="MS"
gen sector1=sector
local industry sector 
}
if "`type'"=="manuf" {
keep if sector1=="Manufacturing (C)" 
drop sector*
gen sector="Manufacturing (C)"
gen sector1=sector
local industry sector 
}
if "`type'"=="serv" {
keep if sector1=="Market_Services (G-H-I-J-K-M-N-R-S-T)" 
drop sector*
gen sector="Market_Services (G-H-I-J-K-M-N-R-S-T)"
gen sector1=sector
local industry sector 
}
*

*Collapse at the level of the parent-industry-isocode (this includes the sales of the GUO-isocode at home as well as in other countries)
*-----------------------------------------------------
sort year isocode sector1 sector `industry' hq guo_bvd
collapse (sum) `varset', by(year isocode sector1 sector `industry' hq guo_bvd)


*Keep only MNCs in the sample based on the number of countries (across ALL SECTORS) 
*----------------------------------------------------- 
by year guo_bvd isocode, sort: gen a=_n==1
tab a
by year guo_bvd, sort: egen num_iso=total(a)
drop a

gen MNC=0
replace MNC=1 if num_iso>1
tab MNC
tab MNC if isocode!=hq
drop if MNC==0 /*For identification an MNCs need to be in at least 2 foreign countries to be in the sample*/


*Rename each variable of interest in the dataset
*----------------------------------------------------- 
foreach vv of local varset {
display "`vv'"
rename `vv' sin_`vv'
}
sort year hq isocode
keep if sin_`var0'!=. & sin_`var0'!=0 
compress



*Bring aggregate Klems-OECD data --- Xn, Xnn, Xn-exp (this files were prepared in 'aggregate_data_sales.do' files ----
*-----------------------------------------------------
merge m:1 year sector isocode using "${data}/klems_oecd_unido_orbis_sales_emp_exp.dta", keepusing(EMP* GO_usd GO_usd_exp VA_usd exports LAB_usd data_source) 
keep if year>=2005 & year<=2017
drop if _merge==2 
drop _merge

*Rename KLEMS/OECD variables  
rename VA_usd yn_va_ko
rename EMPE yn_emp_ko
rename exports yn_exp_ko
rename GO_usd yn_`var0'_ko
rename LAB_usd yn_labcost_ko
rename GO_usd_exp yn_`var0'_exp_ko
format %9.0fc yn_`var0'_ko yn_`var0'_exp_ko yn_exp_ko yn_va_ko yn_labcost_ko

tempfile t0
save `t0', replace


*Keep only the aggregate data and then bring it to the firm level data
*-----------------------------------------------------
use `t0', clear
keep year isocode sector1 sector `industry' yn_`var0'_ko yn_emp_ko  yn_va_ko yn_`var0'_exp_ko  yn_exp_ko yn_labcost_ko
duplicates drop 
collapse (sum) yn_`var0'_ko yn_emp_ko yn_va_ko yn_`var0'_exp_ko  yn_exp_ko yn_labcost_ko, by(year isocode sector1 sector `industry')
tempfile t1
save `t1', replace

use `t0', clear
collapse  (sum) sin_sales sin_emp sin_va sin_exp sin_labcost, by(year guo_bvd isocode hq  sector1 sector `industry')
merge m:1 year isocode sector1 sector `industry' using `t1'
drop _merge

foreach i in sin_sales sin_emp sin_va sin_exp sin_labcost yn_`var0'_ko yn_emp_ko yn_`var0'_exp_ko  yn_exp_ko yn_labcost_ko {
replace `i'=. if `i'==0
}
tempfile final
save `final', replace


*Construct the relevant shares using aggregate KLEMS/OECD
*-----------------------------------------------------
use `final', clear

display "Drop if revenues are below 100,000 USD" 
sum sin_`var0', d
sum sin_`var0' if sin_`var0'<0.1 
drop if sin_`var0'<0.1 

gen sin_emp_ko=sin_emp/yn_emp_ko
gen sin_va_ko=sin_va/yn_va_ko
gen sin_labcost_ko=sin_labcost/yn_labcost_ko
gen sin_`var0'_ko=sin_`var0'/yn_`var0'_ko
gen sin_`var0'_exp_ko=(sin_`var0'-sin_exp)/yn_`var0'_exp_ko

*Drop observations outside the range (only happens for HK, SG and JP)
drop if sin_sales_ko>=1 | sin_sales_ko<=0 | sin_sales_ko==.
replace sin_emp_ko=. if sin_emp_ko>=1 | sin_emp_ko<=0
replace sin_va_ko=. if sin_va_ko>=1 | sin_va_ko<=0
replace sin_labcost_ko=. if sin_labcost_ko>=1 | sin_labcost_ko<=0
replace sin_`var0'_exp_ko=. if sin_`var0'_exp_ko>=1 | sin_`var0'_exp_ko<=0


*Keep only MNC (two countries, which includes two or more affiliates) in a given sector -- notice this is more demanding that the applied restrictions above, but necessary for the sectoral level regressions
*----------------------------------------------
by year sector guo_bvd isocode, sort: gen a=_n==1
by year sector guo_bvd, sort: egen b=total(a)
tab b 
keep if b>=2
drop a b 
format %9.4fc sin* 
format %9.0fc *`var0' yn* 
sort sector1 sector `industry' hq isocode
order sector1 sector `industry' hq isocode sin_`var0'_ko sin_`var0'_exp_ko 


*Create percentiles 
*----------------------------------------------
foreach pos in 10 20 30 40 50 60 70 80 90 {
display `pos'
gsort year isocode sector1 sector `industry' -sin_sales
by year isocode sector1 sector `industry': egen p`pos'_sales = pctile(sin_sales), p(`pos')
format %9.3fc p`pos'_sales
label var p`pos'_sales "pctile `pos' of sin_sales"
}
*


*Number of countries in which the affiliate operates (outside home)
*----------------------------------------------
sort year sector1 `industry' hq guo_bvd isocode
by year sector1 `industry' hq guo_bvd isocode, sort: gen a=_n==1
replace a=0 if isocode==hq
by year sector1 `industry' hq guo_bvd, sort: egen num_ctry=total(a)
drop a
label var num_ctry "number of foreign countries GUO operates within `type'"

tempfile LCS
save `LCS', replace


*Firms in the Largest Connected Set 
*----------------------------------------------
use `LCS', clear
keep year `industry' guo_bvd isocode iso sin_`var0' sin_`var0'_ko yn_`var0'_ko
duplicates drop
sort year isocode guo_bvd
egen iso=group(isocode)
egen guo=group(guo_bvd)
egen sss=group(`industry')
order year `industry' guo_bvd isocode sss guo iso sin_`var0' sin_`var0'_ko yn_`var0'_ko
sort year sss iso guo 
keep if isocode!=""
keep if isocode=="JP" | isocode=="IT" | isocode=="KR" | isocode=="DE" | isocode=="FR" | isocode=="ES" |  isocode=="MX" |  isocode=="GB" |  isocode=="PL" |  isocode=="GR" |  isocode=="NL" |  isocode=="RO" |  isocode=="BE" |  isocode=="AT" |  isocode=="PT" |  isocode=="FI" |  isocode=="SE" |  isocode=="CZ" |  isocode=="SK" |  isocode=="BG" |  isocode=="HU" |  isocode=="HR" |  isocode=="LV" |  isocode=="SI" |  isocode=="LT" |  isocode=="EE" 
save "${output}/LCS_`type'.dta", replace 
keep year sss iso guo sin_`var0' sin_`var0'_ko yn_`var0'_ko
sort year sss iso guo 
order year sss guo iso sin_`var0' sin_`var0'_ko yn_`var0'_ko 
export delimited using "${output}/LCS_`type'", novarnames replace



*Labeling variables of interest 
*----------------------------------------------
use `LCS', clear
label var sin_emp_ko "sin_emp/yn_emp (den KLEMS/OECD)"
label var sin_va_ko "sin_va/yn_va (den KLEMS/OECD)"
label var sin_`var0'_ko "sin_`var0'/yn (den KLEMS/OECD)"
label var sin_`var0'_exp_ko "(sin_`var0'-sin_exp)/yn_`var0'_exp_ko (den KLEMS/OECD/WIOT)"
label var sin_`var0' "`var0' of i MNCs operating in n (cond operating in i)"
label var sin_emp "emp of i MNCs operating in n (cond operating in i)"
label var sin_exp "exports of i MNCs operating in n (cond operating in i)"
label var sin_va "value added of i MNCs operating in n (cond operating in i)"
label var sin_labcost "Labor cost of i MNCs operating in n (cond operating in i)"

label var yn_`var0'_ko "total `var0' in country n"
label var yn_emp_ko "total emp in country n"
label var yn_va_ko "total va in country n"
label var yn_labcost_ko "total labor cost in country n"
label var yn_exp_ko "total exports in country n"
label var yn_`var0'_exp_ko "total `var0' (minus exports) in country n"

compress


*Saving the file
*----------------------------------------------
drop if year==.
order year sector1 sector `industry' hq isocode sin_`var0'_ko sin_`var0'_exp_ko 
compress
tempfile sin_firmlevel_`type'
save `sin_firmlevel_`type'', replace
}
etime 



*Check for outliers
*----------------------------------------------
display "Drop foreing affiliates below 1MM USD and Parents below 5MM USD"
clear all
local whattype naics
use `sin_firmlevel_`whattype'', clear

sum sin_`var0', d
sum sin_`var0' if sin_`var0'<1 
tab isocode if sin_`var0'>=1
display "Drop firms below 1 MILLION USD" 

duplicates drop 
compress 
tempfile sample_firms
save `sample_firms', replace


*===============================================================================
*Temporary check on gravity variables
*===============================================================================
use "${data}/gravdata_00_16.dta", clear
keep if year==2016
replace year=2017
tempfile temp
save `temp', replace

use "${data}/gravdata_00_16.dta", clear
append using `temp'
tempfile gravdata_00_17
save `gravdata_00_17', replace



*Additions: include gravity variables and a dummy of constant MNC across years  
*----------------------------------------------
clear all
set more off

etime, start 
foreach type in `types' {

display "`type'"

if "`type'"=="naics4" {
local industry NAICS4 
}
if "`type'"=="naics3" {
local industry NAICS3 
}
else {
local industry sector
}

*Bring gravity variables 
*-----------------------------------------------
use `sin_firmlevel_`type'', clear

gen iso2_o=hq 
gen iso2_d=isocode
local trade_var distw fta_wto contig comlang_off colony  pop_o pop_d gdp_o gdp_d gdpcap_o gdpcap_d   
merge m:1 iso2_o iso2_d year using `gravdata_00_17', keepusing(`trade_var')

drop if _merge==2
drop _merge
foreach var in iso2_o iso2_d {
encode `var', gen(`var'b)
drop `var'
rename `var'b `var'
}
*
gen distance=ln(distw)
drop distw
drop iso2_o iso2_d
rename fta_wto RTA
label var distance "ln distance" 
label var RTA "Regional Trade Aggrement" 
label var comlang_off "Common Language" 
label var contig "commnon Border" 
label var colony "Former Colony" 
compress


*Flag MNCs number of years it has information (out of yy years)
*-----------------------------------------------
by sector1 sector guo_bvd isocode year, sort: gen a=_n==1
replace a=0 if year<2010
by sector1 sector guo_bvd isocode, sort: egen years_after2010=total(a)
drop a
by sector1 sector guo_bvd isocode year, sort: gen a=_n==1
by sector1 sector guo_bvd isocode, sort: egen years_allperiod=total(a)
drop a
gen sample_const=1
compress
save  "${data}/sin_firmlevel_`type'_gravity${lf}_ctte.dta", replace
}
etime


